################################################
# Replication code for                         #
# Inaccuracies in Low Income Housing Geocodes: #
# When and Why They Matter                     #
################################################

## Load necessary libraries (uncomment install if needed)

#install.packages("dplyr")
#install.packages("ggplot2")
#install.packages("readxl")
#install.packages("sf")
#install.packages("ggspatial")

library(dplyr)
library(ggplot2)
library(readxl)
library(sf)
library(ggspatial)

## Set wd to location of data
setwd("~/Dropbox (MIT)/lihtc_RA/replication")

# load data
desc <- read.csv("lihtc_desc.csv") # data with descriptive variables 
#check_ca <- read.csv("lihtc_checked_ca.csv") # hand-checked CA data (can't upload full data to comply with Google TOS)
#check_nat <- read.csv("lihtc_checked_nat.csv") # hand-checked national data (can't upload full data to comply with Google TOS)
check_ca <- read.csv("lihtc_checked_ca_filt.csv") 
check_nat <- read.csv("lihtc_checked_nat_filt.csv") 
zcta <- st_read("tl_2010_us_zcta510/tl_2010_us_zcta510.shp") #Census ZIP Code Tabulation Areas
rural <- read_excel("full-ZCTA-urban-suburban-rural-classification.xlsx", sheet = 1) #From http://jedkolko.com/datasets/
bg <- st_read("tl_2010_06_bg10/tl_2010_06_bg10.shp") 
bl <- st_read("tl_2010_06_tabblock10/tl_2010_06_tabblock10.shp")


#### MAIN TEXT ####

#### Comparison of HUD and Google Coordinates ####

## HUD and Google accuracy 
prop.table(table(check_ca$hud_accurate))
prop.table(table(check_ca$google_accurate))

## Google more accurate than HUD
prop.table(table(check_ca$google_better))

check_ca %>% 
  group_by(hud_accurate)%>% 
  summarise(mean(google_better))

## Table 1: HUD and Google accuracy 

tab_1 <- prop.table(table(check_ca$hud_accurate, check_ca$google_accurate))
names(dimnames(tab_1)) <- c("HUD Accurate", "Google Accurate")
round(tab_1*100,1)

## Google better when HUD accurate
check_ca %>%
  filter(hud_accurate==1)%>%
  summarise(mean(google_better))

## Recording new coordinates
check_ca %>%
  filter(hud_accurate==1)%>%
  summarise(mean(use_neither))

check_ca %>%
  filter(google_accurate==1)%>%
  summarise(mean(use_neither))

#### Degree of Inaccuracies ####

## Distance between HUD/Google and correct coordinate
summary(check_ca$distance_hud_correct) # Median = 70, mean = 153
summary(check_ca$distance_google_correct) # Median = 0, mean = 136

## Distance between HUD/Google and correct coordinate -- when incorrect
summary(check_ca$distance_hud_correct[which(check_ca$hud_accurate==0)]) # Median = 90, mean = 271
summary(check_ca$distance_google_correct[which(check_ca$google_accurate==0)]) # Median = 178, mean = 2140

## Figure 1: HUD and Google distance

distance.df <- data.frame(distance_off = as.numeric(c(check_ca$distance_hud_correct[which(check_ca$hud_accurate==0)],
                                                      check_ca$distance_google_correct[which(check_ca$google_accurate==0)])),
                          coord_type = c(rep("HUD", length(which(check_ca$hud_accurate==0))),
                                         rep("Google", length(which(check_ca$google_accurate==0)))))

ggplot(distance.df, aes(x=coord_type, y=distance_off)) + 
  geom_boxplot()+
  stat_summary(fun="mean",color="gray", shape=13)+
  theme_classic()+
  xlab("")+
  ylab("Distance from correct location\n(in meters)")+
  coord_flip(ylim = c(0, 5100))+
  theme(plot.title = element_text(hjust = 0.5),plot.caption = element_text(hjust = 0))

#### Replication with National Data ####

## Table 2: HUD and Google accuracy (national sample)

tab_2 <- prop.table(table(check_nat$hud_accurate, check_nat$google_accurate))
names(dimnames(tab_2)) <- c("HUD Accurate", "Google Accurate")
tab_2

#### Implications ####

bg <- st_transform(bg, 4326)
bl <- st_transform(bl, 4326)

hud_sf <- st_as_sf(x = check_ca,
                   coords = c("hud_lon", "hud_lat"),
                   crs = 4326)

google_sf <- st_as_sf(x = check_ca,
                      coords = c("google_lon", "google_lat"),
                      crs = 4326)

use_sf <- st_as_sf(x = check_ca,
                   coords = c("use_lon", "use_lat"),
                   crs = 4326)

# California should have 8,057 census tracts; 23,212 block groups; 710,145 blocks.

# define unique tracts
bg$unique_tract <- paste0(bg$COUNTYFP10, bg$TRACTCE10)
bl$unique_tract <- paste0(bl$COUNTYFP10, bl$TRACTCE10)
length(unique(bg$unique_tract)) #8057
length(unique(bl$unique_tract)) #8057

# define unique block groups
bg$unique_bg <- paste0(bg$unique_tract,bg$BLKGRPCE10)
length(unique(bg$unique_bg)) #23212

# define unique blocks
bl$unique_block <- paste0(bl$unique_tract, bl$BLOCKCE10)
length(unique(bl$unique_block)) #710145

# place google, hud and correct coordinates into block groups
hud_in_bg <- st_join(hud_sf, bg, join = st_within)
google_in_bg <- st_join(google_sf, bg, join = st_within)
correct_in_bg <- st_join(use_sf, bg, join = st_within)

# place google, hud and correct coordinates into blocks
hud_in_block <- st_join(hud_sf, bl, join = st_within)
google_in_block <- st_join(google_sf, bl, join = st_within)
correct_in_block <- st_join(use_sf, bl, join = st_within)


# Google in correct tract 98.1% of time 
prop.table(table(google_in_bg$unique_tract==correct_in_bg$unique_tract))
# HUD in correct tract 96.7% of time 
prop.table(table(hud_in_bg$unique_tract==correct_in_bg$unique_tract))
# Google in correct block group 97.8% of time 
prop.table(table(google_in_bg$unique_bg==correct_in_bg$unique_bg))
# HUD in correct block group 94.5% of time 
prop.table(table(hud_in_bg$unique_bg==correct_in_bg$unique_bg))
# Google in correct block 94.1% of the time
prop.table(table(google_in_block$unique_block==correct_in_block$unique_block))
# HUD in correct block 80.6% of the time
prop.table(table(hud_in_block$unique_block==correct_in_block$unique_block))

# illustrate with one tract
tract_error <- bl %>%
  filter(unique_tract=="085503112")%>%
  dplyr::select(TRACTCE10, BLOCKCE10)

ggplot(data = tract_error, aes(color=hud_id)) +
  geom_sf(fill= "white", color="black")+
  theme_void()+
  geom_sf(data = correct_in_block[which(correct_in_block$unique_tract=="085503112"),], shape = 21, aes(fill=hud_id), color="black", size=4.5, stroke=1)+ 
  theme(axis.text.x=element_blank(), #remove x axis labels
        axis.ticks.x=element_blank(), #remove x axis ticks
        axis.text.y=element_blank(),  #remove y axis labels
        axis.ticks.y=element_blank()  #remove y axis ticks
  ) + 
  geom_sf(data = google_in_block[which(google_in_block$unique_tract=="085503112"),], aes(shape="Google",color=hud_id), size=5.5, stroke=2) +  #CAA20060055
  geom_sf(data = hud_in_block[which(hud_in_block$unique_tract=="085503112"),],
          aes(shape = "HUD", color=hud_id), size = 5.5, stroke = 2) +
  scale_fill_manual(values = c("white","gray44","black"), labels = c("Facility 1", "Facility 2", "Facility 3"), name = "Correct location\nof LIHTC facility")+ #changed to white+
  scale_color_manual(values = c("gray44","black"), name = "HUD ID", guide="none")+ #changed to white+
  scale_shape_manual(values = c("Google"=3, "HUD"=4), name="")+ #changed from 20
  ggspatial::annotation_scale()+
  guides(fill = guide_legend(order = 1), 
         shape = guide_legend(order = 2))

#### Appendix ####

## Summary statistics for Table A.1
summary(desc)
summary(desc[which(desc$CA_sample==1),])
summary(desc[which(desc$national_sample==1),])

## Figure A.1
lihtc_merged <- left_join(check_ca, desc, by="hud_id")

ggplot(lihtc_merged, aes(yr_pis,hud_accurate)) +
  geom_jitter()+
  geom_smooth()+
  theme_bw()+
  scale_y_continuous(breaks = c(0,1), limits=c(-.25, 1.25),
                     labels = c("No","Yes"), name="HUD Accurate")+
  xlab("Year Placed in Service")

## Figure A.2

ggplot(lihtc_merged, aes(allocamt/1000, hud_accurate))+
  geom_jitter()+
  geom_smooth()+
  theme_bw()+
  scale_y_continuous(breaks = c(0,1), limits=c(-.25, 1.25),
                     labels = c("No","Yes"), name="HUD Accurate")+
  xlab("Allocated Amount (Thousands USD Yearly)")

## Figure A.3

ggplot(lihtc_merged, aes(n_units, hud_accurate))+
  geom_jitter()+
  geom_smooth()+
  theme_bw()+
  scale_y_continuous(breaks = c(0,1), limits=c(-.25, 1.25),
                     labels = c("No","Yes"), name="HUD Accurate")+
  xlab("Number of Units (Total)")

## Figure A.4

ggplot(lihtc_merged, aes(li_units, hud_accurate))+
  geom_jitter()+
  geom_smooth()+
  theme_bw()+
  scale_y_continuous(breaks = c(0,1), limits=c(-.25, 1.25),
                     labels = c("No","Yes"), name="HUD Accurate")+
  xlab("Number of Low-Income Units")


## Population density 
zcta <- st_transform(zcta, crs = 4326)
head(zcta)

ca_sf <- st_as_sf(check_ca, coords = c("use_lon", "use_lat"), crs = 4326) 
nat_sf <- st_as_sf(check_nat, coords = c("use_lon", "use_lat"), crs = 4326) 

ca_zcta <- st_join(ca_sf, zcta, join = st_within)
nat_zcta <- st_join(nat_sf, zcta, join = st_within)

rural$ZIP_CODE <- sprintf("%05d", rural$ZCTA)
table(rural$classification)

rural$rural <- ifelse(rural$classification == 3, 1, 0)

table(rural$rural)

ca_zcta_rural <- merge(ca_zcta, rural, by.x = "ZCTA5CE10", by.y = "ZIP_CODE")

ca_zcta_rural$distance_hud_correct <- as.vector(ca_zcta_rural$distance_hud_correct)
ca_zcta_rural$distance_google_correct <- as.vector(ca_zcta_rural$distance_google_correct)

summary(lm(hud_accurate ~ rural, ca_zcta_rural)) 
summary(lm(hud_accurate ~ log(density), ca_zcta_rural)) 

summary(lm(log(distance_hud_correct) ~ rural, subset(ca_zcta_rural, hud_accurate == 0))) 
summary(lm(log(distance_hud_correct) ~ log(density), subset(ca_zcta_rural, distance_hud_correct>0 &hud_accurate == 0))) 

summary(lm(google_accurate ~ rural, ca_zcta_rural)) 
summary(lm(google_accurate ~ log(density), ca_zcta_rural)) 

summary(lm(log(distance_google_correct) ~ rural, subset(ca_zcta_rural, google_accurate == 0))) # not statistically significant
summary(lm(log(distance_google_correct) ~ log(density), subset(ca_zcta_rural, distance_google_correct >0 &google_accurate == 0))) # not statistically significant

# national data
nat_zcta_rural <- merge(nat_zcta, rural, by.x = "ZCTA5CE10", by.y = "ZIP_CODE")
nat_zcta_rural$distance_hud_correct <- as.vector(nat_zcta_rural$distance_hud_correct)
nat_zcta_rural$distance_google_correct <- as.vector(nat_zcta_rural$distance_google_correct)

summary(lm(hud_accurate ~ rural, nat_zcta_rural)) 
summary(lm(hud_accurate ~ log(density), nat_zcta_rural)) 

summary(lm(log(distance_hud_correct) ~ rural, subset(nat_zcta_rural,distance_hud_correct > 0 & hud_accurate == 0))) 
summary(lm(log(distance_hud_correct) ~ log(density), subset(nat_zcta_rural,distance_hud_correct > 0 & hud_accurate == 0))) 

summary(nat_zcta_rural$density)
summary(subset(nat_zcta_rural, hud_accurate == 0)$distance_hud_correct)

summary(lm(google_accurate ~ rural, nat_zcta_rural)) 
summary(lm(google_accurate ~ log(density), nat_zcta_rural)) 

summary(lm(log(distance_google_correct) ~ rural, subset(nat_zcta_rural, google_accurate == 0))) 
summary(lm(log(distance_google_correct) ~ log(density), subset(nat_zcta_rural,distance_google_correct>0 & google_accurate == 0))) 

## Figure A.5

ggplot(ca_zcta_rural, aes(log(density), hud_accurate))+
  geom_jitter()+
  geom_smooth()+
  theme_bw()+
  scale_y_continuous(breaks = c(0,1), limits=c(-.25, 1.25),
                     labels = c("No","Yes"), name="HUD Accurate")+
  xlab("Population Density (logged), California Sample")

## Figure A.6

ggplot(nat_zcta_rural, aes(log(density), hud_accurate))+
  geom_jitter()+
  geom_smooth()+
  theme_bw()+
  scale_y_continuous(breaks = c(0,1), limits=c(-.25, 1.25),
                     labels = c("No","Yes"), name="HUD Accurate")+
  xlab("Population Density (logged), National Sample")

## Figure A.7

ggplot(subset(ca_zcta_rural, distance_hud_correct>0 & hud_accurate == 0), aes(log(density), log(distance_hud_correct)))+
  # geom_jitter(height = .1)+
  geom_point() + 
  geom_smooth()+
  theme_bw() +
  ylab("HUD Geocode Error Distance (Meters, Logged)") + 
  xlab("Population Density (Logged), California Sample")

## Figure A.8

ggplot(subset(nat_zcta_rural, distance_hud_correct>0 & hud_accurate == 0), aes(log(density), log(distance_hud_correct)))+
  geom_point() +
  geom_smooth()+
  theme_bw() +
  ylab("HUD Geocode Error Distance (Meters, Logged)") + 
  xlab("Population Density (Logged), National Sample")


## Figure B.1

## accuracy rates within certain bands
summary(check_ca$distance)

check_ca$distance_quartile <- ifelse(check_ca$distance >= 35 & check_ca$distance  < 52.67, 1,
                                  ifelse(check_ca$distance >= 52.67 & check_ca$distance < 76.15, 2,
                                         ifelse(check_ca$distance >= 76.15 & check_ca$distance < 121.77, 3,
                                                ifelse(check_ca$distance >=121.77, 4, NA))))

accuracy_quartile <- check_ca %>%
  group_by(distance_quartile)%>%
  dplyr::summarise(hud_acc = mean(hud_accurate, na.rm = T))

accuracy_quartile

ggplot(accuracy_quartile, aes(x=distance_quartile, y=hud_acc))+
  geom_bar(stat="identity")+
  theme_classic() +
  ylab("HUD Accuracy Rate")+
  ylim(0,1)+
  xlab("Distance Quartile")

## Table B.1

check_ca$directional <- ifelse(grepl(" N ", check_ca$full_addr) |
                                     grepl(" S ", check_ca$full_addr) |
                                     grepl(" E ", check_ca$full_addr) |
                                     grepl(" W ", check_ca$full_addr), TRUE, FALSE)

check_ca %>% 
  group_by(directional)%>%
  summarise(google_acc = mean(google_accurate, na.rm=T),
            hud_acc = mean(hud_accurate, na.rm=T))





